*******************************************************************
* This file generates the baseline dataset for demand estimation. *
*******************************************************************
* 1. Merge the data regarding distances between notaries and tracts with the original information on notaries and tracts.
* 2. The data is filtered to contain market-notary combinations which comply with the market catchment area definition. 

*************************
* 0 Set path to files *
*************************
* Reset the session: drop data in memory, close any open log (ignoring the error
* if none is open), and switch off output paging and command tracing.
clear all
capture log close
set more off
set trace off
* Change to the project root stored in the global main_directory, which must be
* defined before this file is run (it is not set anywhere in this file).
* The path is wrapped in double quotes so that directories with embedded spaces work.
* If the global is empty, no directory change is made and a note is shown instead of
* silently executing a bare "cd".
if `"$main_directory"' == "" {
	display as error "Note: global main_directory is empty; staying in the current working directory."
}
else {
	cd "$main_directory"
}


***************************************************************************************************************************
* 1. Merge the data regarding distances between notaries and tracts with the original information on notaries and tracts. *
***************************************************************************************************************************
//Load distances between notaries and census tracts
//Paths use forward slashes: Stata accepts them on every platform, while backslashes only resolve on Windows
use "Data_Belgium/Generated_data/distance_notary_Census_centroid_reduced.dta", clear
rename firm_ID off_id //the notary file is keyed on off_id
* Notary data: attach notary characteristics and keep only pairs whose notary is matched
merge m:1 off_id using "Data_Belgium/Generated_data/BE_notary_data.dta"
rename (NIS city) (nis_not city_not) //suffix _not marks the notary's own location
keep if _merge==3
drop _merge
* Market data: derive the municipality code (nis) from the census tract label and attach municipal characteristics
rename market_ID CENSUS_ID
decode CENSUS_ID, gen(CENSUS) //value label of the tract as a string
gen nis=substr(CENSUS,7,11) //characters from position 7 of the label hold the municipality code
destring nis, replace
* NOTE(review): unmatched observations from either side are retained here; confirm this is intended
merge m:1 nis using "Data_Belgium/Generated_data/BE_municipal_data.dta"
drop _merge
drop Income //This is the income based on regional rather than municipal data. Since it is less precise, we do not use this variable

* Restore the original identifier names
rename off_id firm_ID
rename CENSUS_ID market_ID

format market_ID %24.0g //to more easily see this with browse
* Generating additional notary-level variables
format StartDate_off %td //start date of head office from Belfirst
format Time_since_operation1 %td //start date of notary from Belgian website (including predecessors)
replace StartDate_off=td(01jan2012) if StartDate_off==. //missing start dates are arbitrarily set to 1 Jan 2012
replace StartDate_off=td(01jan1999) if StartDate_off<td(01jan1999) //arbitrarily truncate at 1 Jan 1999
gen start_not=(StartDate_off-td(01jan1999))/365 //time since 1 Jan 1999 in decimals (365-day years)
replace NotPerOff=1 if NotPerOff==. //offices with a missing notary count are treated as having one notary
gen lnNotPerOff=log(NotPerOff)
gen LDE=Lf==5 //Legal Form is "LDE", I think non-limited liability firm, almost always single person offices

*************************************************************************************************************************
* 2. The data is filtered to contain market-notary combinations which comply with the market catchment area definition. *
*************************************************************************************************************************
* Create ranking for closest notaries in market (1 = closest).
* Ties in distance are broken by firm_ID so that the ranking, and therefore the choice set
* selected below, is reproducible across runs. Observations with missing distance sort last
* and receive a missing rank.
bysort market_ID (distance firm_ID): gen rankdist_market = _n if !missing(distance)
* Check that the market in which a notary is located is always in its catchment area (checking the geocoding).
gen common=(nis==nis_not) //nis is local market, nis_not is notary location
	//checking relation between market nis and notary nis
	egen test=total(common),by(nis) //does local market have at least one notary? (0 = none)
	tab test if distance<15 //2,128 obs for which local market has no notary within 15km

* One dummy (arr1, arr2, ...) per judicial district
tab arrondjud,gen(arr)

* Calculate market characteristics for the local market of the notary:
* for each variable, take its value in the rows where the notary sits in the market's own
* municipality (common==1) and spread it to all rows of that notary as <var>_not.
foreach var of varlist popdens arrondjud GermanComm GermanExtra {
	tempvar home_value
	gen `home_value' = `var' if common==1
	egen `var'_not=mean(`home_value'),by(firm_ID)
	drop `home_value'
}

*use arrondjud_not to create dummy whether market is in same arrond as notary
	*two missing values compare as equal in Stata, so pairs with an unknown district on either side are explicitly not flagged
gen arrondjud_same=(arrondjud==arrondjud_not) & !missing(arrondjud, arrondjud_not)
*use GermanComm_not and GermanExtra_not to create dummy whether market can go outside its arrond to visit notary
	*GermanComm=1 is arrond Eupen, GermanExtra is part of arrond Liege near Eupen
gen arrondjud_out=0
replace arrondjud_out=1 if GermanComm==1&GermanExtra_not==1 //markets in Eupen can go to part of Liege
replace arrondjud_out=1 if GermanExtra==1&GermanComm_not==1 //markets in part of Liege can go to Eupen 

* Selecting the choice set
* Keep observations that are within 15 km of the municipal centroid, as well as the 5 nearest notaries.
* (Rows with missing distance also have a missing rank and are dropped here.)
drop if distance>15  & rankdist_market>5
* Since in Brussels the choice set is potentially very large, you can drop notaries who are not the 15 nearest
* (notaries located in the market's own municipality are always kept)
drop if arrond_str=="Arrondissement Brussel-Hoofdstad"&rankdist_market>15&common==0
* Drop notaries who are not in the same judicial district (or in the permitted Eupen/Liege exception)
keep if arrondjud_same==1|arrondjud_out==1
*Given selected choice set, #markets from which firm can collect revenue (catchment area)
egen number_markets = count(market_ID), by (firm_ID)
*Given selected choice set, #firms that are in choice set for a market
egen number_firms = count(firm_ID), by (market_ID)

*******************
* Table 3 Panel C *
*******************
* Detailed summary statistics of notary-market distance in the selected choice set
summarize distance, detail
rename Population population

* Save the baseline demand dataset, overwriting any previous version.
* The path uses forward slashes so that it resolves on Windows, macOS and Linux alike.
save "Data_Belgium/Generated_data/baseline_demand_dataset.dta", replace
